import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import scipy.stats as st
from sklearn import ensemble, tree, linear_model
import missingno as msno
Read the AB_NYC_2019.csv file
# Load the NYC Airbnb listings; the relative path is resolved against the working directory.
AirBnB = pd.read_csv("AB_NYC_2019.csv")
Describe the dataset
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
AirBnB.describe()
| id | host_id | latitude | longitude | price | minimum_nights | number_of_reviews | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 4.889500e+04 | 4.889500e+04 | 48895.000000 | 48895.000000 | 48895.000000 | 48895.000000 | 48895.000000 | 38843.000000 | 48895.000000 | 48895.000000 |
| mean | 1.901714e+07 | 6.762001e+07 | 40.728949 | -73.952170 | 152.720687 | 7.029962 | 23.274466 | 1.373221 | 7.143982 | 112.781327 |
| std | 1.098311e+07 | 7.861097e+07 | 0.054530 | 0.046157 | 240.154170 | 20.510550 | 44.550582 | 1.680442 | 32.952519 | 131.622289 |
| min | 2.539000e+03 | 2.438000e+03 | 40.499790 | -74.244420 | 0.000000 | 1.000000 | 0.000000 | 0.010000 | 1.000000 | 0.000000 |
| 25% | 9.471945e+06 | 7.822033e+06 | 40.690100 | -73.983070 | 69.000000 | 1.000000 | 1.000000 | 0.190000 | 1.000000 | 0.000000 |
| 50% | 1.967728e+07 | 3.079382e+07 | 40.723070 | -73.955680 | 106.000000 | 3.000000 | 5.000000 | 0.720000 | 1.000000 | 45.000000 |
| 75% | 2.915218e+07 | 1.074344e+08 | 40.763115 | -73.936275 | 175.000000 | 5.000000 | 24.000000 | 2.020000 | 2.000000 | 227.000000 |
| max | 3.648724e+07 | 2.743213e+08 | 40.913060 | -73.712990 | 10000.000000 | 1250.000000 | 629.000000 | 58.500000 | 327.000000 | 365.000000 |
Look at the top and bottom 5 rows of data (the default for head() and tail()) and look at the shape of the data frame (number of rows and columns)
# First five rows (head() default) to eyeball the columns and typical values.
AirBnB.head()
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2539 | Clean & quiet apt home by the park | 2787 | John | Brooklyn | Kensington | 40.64749 | -73.97237 | Private room | 149 | 1 | 9 | 2018-10-19 | 0.21 | 6 | 365 |
| 1 | 2595 | Skylit Midtown Castle | 2845 | Jennifer | Manhattan | Midtown | 40.75362 | -73.98377 | Entire home/apt | 225 | 1 | 45 | 2019-05-21 | 0.38 | 2 | 355 |
| 2 | 3647 | THE VILLAGE OF HARLEM....NEW YORK ! | 4632 | Elisabeth | Manhattan | Harlem | 40.80902 | -73.94190 | Private room | 150 | 3 | 0 | NaN | NaN | 1 | 365 |
| 3 | 3831 | Cozy Entire Floor of Brownstone | 4869 | LisaRoxanne | Brooklyn | Clinton Hill | 40.68514 | -73.95976 | Entire home/apt | 89 | 1 | 270 | 2019-07-05 | 4.64 | 1 | 194 |
| 4 | 5022 | Entire Apt: Spacious Studio/Loft by central park | 7192 | Laura | Manhattan | East Harlem | 40.79851 | -73.94399 | Entire home/apt | 80 | 10 | 9 | 2018-11-19 | 0.10 | 1 | 0 |
# Last five rows (tail() default).
AirBnB.tail()
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 48890 | 36484665 | Charming one bedroom - newly renovated rowhouse | 8232441 | Sabrina | Brooklyn | Bedford-Stuyvesant | 40.67853 | -73.94995 | Private room | 70 | 2 | 0 | NaN | NaN | 2 | 9 |
| 48891 | 36485057 | Affordable room in Bushwick/East Williamsburg | 6570630 | Marisol | Brooklyn | Bushwick | 40.70184 | -73.93317 | Private room | 40 | 4 | 0 | NaN | NaN | 2 | 36 |
| 48892 | 36485431 | Sunny Studio at Historical Neighborhood | 23492952 | Ilgar & Aysel | Manhattan | Harlem | 40.81475 | -73.94867 | Entire home/apt | 115 | 10 | 0 | NaN | NaN | 1 | 27 |
| 48893 | 36485609 | 43rd St. Time Square-cozy single bed | 30985759 | Taz | Manhattan | Hell's Kitchen | 40.75751 | -73.99112 | Shared room | 55 | 1 | 0 | NaN | NaN | 6 | 2 |
| 48894 | 36487245 | Trendy duplex in the very heart of Hell's Kitchen | 68119814 | Christophe | Manhattan | Hell's Kitchen | 40.76404 | -73.98933 | Private room | 90 | 7 | 0 | NaN | NaN | 1 | 23 |
# (number of rows, number of columns)
AirBnB.shape
(48895, 16)
Examine numerical features in the dataset
# Keep only the numeric columns ("number" selects every NumPy numeric dtype).
numeric_features = AirBnB.select_dtypes(include="number")
numeric_features.columns
Index(['id', 'host_id', 'latitude', 'longitude', 'price', 'minimum_nights',
'number_of_reviews', 'reviews_per_month',
'calculated_host_listings_count', 'availability_365'],
dtype='object')
Examine categorical features in the dataset
# Keep only the object-dtype columns, which hold this dataset's text fields.
categorical_features = AirBnB.select_dtypes(include="object")
categorical_features.columns
Index(['name', 'host_name', 'neighbourhood_group', 'neighbourhood',
'room_type', 'last_review'],
dtype='object')
Estimate Skewness and Kurtosis
# Skewness of each numeric column. numeric_only=True excludes the text columns
# explicitly instead of relying on the deprecated implicit default.
AirBnB.skew(numeric_only=True)
C:\Users\leigh\AppData\Local\Temp\ipykernel_20520\2556107286.py:1: FutureWarning: The default value of numeric_only in DataFrame.skew is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. AirBnB.skew()
id -0.090257 host_id 1.206214 latitude 0.237167 longitude 1.284210 price 19.118939 minimum_nights 21.827275 number_of_reviews 3.690635 reviews_per_month 3.130189 calculated_host_listings_count 7.933174 availability_365 0.763408 dtype: float64
# Kurtosis of each numeric column. numeric_only=True excludes the text columns
# explicitly instead of relying on the deprecated implicit default.
AirBnB.kurt(numeric_only=True)
C:\Users\leigh\AppData\Local\Temp\ipykernel_20520\772064313.py:1: FutureWarning: The default value of numeric_only in DataFrame.kurt is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. AirBnB.kurt()
id -1.227748 host_id 0.169106 latitude 0.148845 longitude 5.021646 price 585.672879 minimum_nights 854.071662 number_of_reviews 19.529788 reviews_per_month 42.493469 calculated_host_listings_count 67.550888 availability_365 -0.997534 dtype: float64
Now check to see if price is normally distributed
y = AirBnB['price']
# Overlay three candidate distributions on the price histogram, one figure each.
# sns.distplot is deprecated (removal announced for seaborn v0.14), so the
# density histogram is drawn with histplot and the fitted PDF is plotted by hand.
grid = np.linspace(y.min(), y.max(), 200)
candidates = [
    ('Johnson SU', st.johnsonsu),
    ('Normal', st.norm),
    ('Log Normal', st.lognorm),
]
for fig_num, (label, dist) in enumerate(candidates, start=1):
    plt.figure(fig_num)
    plt.title(label)
    # stat='density' puts the bars on the same scale as a PDF; 50 bins mirrors
    # the cap distplot applied to its automatic bin count.
    sns.histplot(y, bins=50, stat='density')
    # Fit the candidate's parameters to the data, then evaluate its PDF on the grid.
    params = dist.fit(y)
    plt.plot(grid, dist.pdf(grid, *params), color='.2')
C:\Users\leigh\AppData\Local\Temp\ipykernel_20520\1996513134.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(y, kde=False, fit=st.johnsonsu) C:\Users\leigh\AppData\Local\Temp\ipykernel_20520\1996513134.py:5: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(y, kde=False, fit=st.norm) C:\Users\leigh\AppData\Local\Temp\ipykernel_20520\1996513134.py:7: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(y, kde=False, fit=st.lognorm)
<Axes: title={'center': 'Log Normal'}, xlabel='price'>
We can see that price is heavily skewed. From the five-number summary previously obtained we know the range is 0-10,000 with Q3 being 175. So roughly 75 percent of the properties fall between 0 and 175. Check the distribution of price for only properties below 175 (about 75 percent of the sample)
Find the five-number summary for price so we can confirm the cut-off (Q3 = 175) used for the plots below
# Count, mean, std, min, quartiles and max of the price column.
AirBnB.price.describe()
count 48895.000000 mean 152.720687 std 240.154170 min 0.000000 25% 69.000000 50% 106.000000 75% 175.000000 max 10000.000000 Name: price, dtype: float64
# Restrict to listings priced below 175 (the third quartile of price).
low_price = AirBnB[AirBnB['price'] < 175]
y = low_price['price']
# Overlay three candidate distributions on the price histogram, one figure each.
# sns.distplot is deprecated (removal announced for seaborn v0.14), so the
# density histogram is drawn with histplot and the fitted PDF is plotted by hand.
grid = np.linspace(y.min(), y.max(), 200)
candidates = [
    ('Johnson SU', st.johnsonsu),
    ('Normal', st.norm),
    ('Log Normal', st.lognorm),
]
for fig_num, (label, dist) in enumerate(candidates, start=1):
    plt.figure(fig_num)
    plt.title(label)
    # stat='density' puts the bars on the same scale as a PDF; 50 bins mirrors
    # the cap distplot applied to its automatic bin count.
    sns.histplot(y, bins=50, stat='density')
    # Fit the candidate's parameters to the data, then evaluate its PDF on the grid.
    params = dist.fit(y)
    plt.plot(grid, dist.pdf(grid, *params), color='.2')
C:\Users\leigh\AppData\Local\Temp\ipykernel_20520\1939400500.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(y, kde=False, fit=st.johnsonsu) C:\Users\leigh\AppData\Local\Temp\ipykernel_20520\1939400500.py:6: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(y, kde=False, fit=st.norm) C:\Users\leigh\AppData\Local\Temp\ipykernel_20520\1939400500.py:8: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(y, kde=False, fit=st.lognorm)
<Axes: title={'center': 'Log Normal'}, xlabel='price'>
There is still a lot of variation, but the distribution is relatively normal, especially using the raw numbers (not transformed).
Show number of missing values per variable
# Number of missing values in each column.
AirBnB.isna().sum()
id 0 name 16 host_id 0 host_name 21 neighbourhood_group 0 neighbourhood 0 latitude 0 longitude 0 room_type 0 price 0 minimum_nights 0 number_of_reviews 0 last_review 10052 reviews_per_month 10052 calculated_host_listings_count 0 availability_365 0 dtype: int64
Replace null values with appropriate values:
# Fill every column that has missing values in a single DataFrame-level call.
# The previous per-column `AirBnB[col].fillna(..., inplace=True)` form is a
# chained in-place operation, which pandas warns about and which stops
# updating the parent frame under Copy-on-Write.
# NOTE(review): last_review / reviews_per_month are presumably missing only for
# listings with zero reviews; the mode/mean imputation is kept as-is, but 0
# reviews per month may be a more faithful fill - confirm.
fill_values = {
    'name': 'Replaced name',
    'host_name': 'Replaced host name',
    'last_review': AirBnB['last_review'].mode().iloc[0],
    'reviews_per_month': AirBnB['reviews_per_month'].mean(),
}
AirBnB.fillna(value=fill_values, inplace=True)
Now re-check to make sure there are no missing values
# Re-count missing values per column after the fill.
AirBnB.isna().sum()
id 0 name 0 host_id 0 host_name 0 neighbourhood_group 0 neighbourhood 0 latitude 0 longitude 0 room_type 0 price 0 minimum_nights 0 number_of_reviews 0 last_review 0 reviews_per_month 0 calculated_host_listings_count 0 availability_365 0 dtype: int64
Correlation matrix with price
# Pairwise correlation of the numeric columns.
# NOTE(review): numeric_features was selected before the missing values were
# filled, so reviews_per_month presumably still contains NaN here - confirm.
correlation = numeric_features.corr()
# Correlation of every numeric feature with price, strongest positive first.
correlation.loc[:, 'price'].sort_values(ascending=False)
price 1.000000 availability_365 0.081829 calculated_host_listings_count 0.057472 minimum_nights 0.042799 latitude 0.033939 host_id 0.015309 id 0.010619 reviews_per_month -0.030608 number_of_reviews -0.047954 longitude -0.150019 Name: price, dtype: float64
To explore further we will start with the following visualisation methods to analyze the data better:
# Heatmap of the full numeric correlation matrix, drawn on an explicit axes.
f, ax = plt.subplots(figsize=(14, 12))
ax.set_title('Correlation of Numeric Features', y=1, size=16)
sns.heatmap(correlation, square=True, ax=ax)
<Axes: title={'center': 'Correlation of Numeric Features'}>
The only strong correlations appear to be number_of_reviews with id, which is not likely to be helpful, and number_of_reviews with reviews_per_month.
# Order the numeric features by their correlation with price and redraw the
# matrix in that order, annotated with the coefficients.
# k exceeds the number of numeric columns, so every feature is kept.
k = 11
cols = correlation.nlargest(k, 'price')['price'].index
print(cols)
# rowvar=False treats each column as a variable, same as transposing first.
cm = np.corrcoef(AirBnB[cols].to_numpy(), rowvar=False)
f, ax = plt.subplots(figsize=(14, 12))
tick_labels = cols.to_numpy()
sns.heatmap(
    cm,
    vmax=.8,
    linewidths=0.01,
    linecolor="white",
    square=True,
    annot=True,
    annot_kws={'size': 12},
    cmap='viridis',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    ax=ax,
)
Index(['price', 'availability_365', 'calculated_host_listings_count',
'minimum_nights', 'latitude', 'host_id', 'id', 'reviews_per_month',
'number_of_reviews', 'longitude'],
dtype='object')
<Axes: >
Price isn't highly correlated with any variable.
Visualisation of the 'price', 'latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'reviews_per_month' and 'calculated_host_listings_count' features in the form of a scatter pair plot (with KDE plots on the diagonal) for better understanding.
# Apply seaborn's default theme to the plots that follow.
sns.set()
columns = ['price','latitude','longitude','minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count']
# Scatter plot for every pair of the selected columns, KDE on the diagonal.
# `height` is the current name of the per-facet size keyword; the old `size`
# spelling only survives as a deprecated alias that emits a UserWarning.
sns.pairplot(AirBnB[columns], height=2, kind='scatter', diag_kind='kde')
plt.show()
C:\Users\leigh\anaconda3\lib\site-packages\seaborn\axisgrid.py:2095: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
Unsurprisingly the scatterplot for longitude and latitude aligns visually with the map of New York, so we'll overlay it onto the map later.
Scatter plots between price, longitude and latitude
# Three stacked panels: price vs longitude, price vs latitude, latitude vs longitude.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=1, figsize=(14, 10))

# Two-column frames feeding each regression plot.
price_longitude_scatter_plot = AirBnB[['price', 'longitude']]
price_latitude_scatter_plot = AirBnB[['price', 'latitude']]
latitude_longitude_scatter_plot = AirBnB[['latitude', 'longitude']]

# regplot draws the scatter and the fitted regression line by default.
sns.regplot(data=price_longitude_scatter_plot, x='longitude', y='price', ax=ax1)
sns.regplot(data=price_latitude_scatter_plot, x='latitude', y='price', ax=ax2)
sns.regplot(data=latitude_longitude_scatter_plot, x='longitude', y='latitude', ax=ax3)
<Axes: xlabel='longitude', ylabel='latitude'>
Still very little to show correlating price with location.
Bar chart of median price by neighborhood group
# Median listing price per neighbourhood group. The aggregation is passed by
# name ('median') because handing pandas the NumPy callable is deprecated and
# emits a FutureWarning on recent versions; the result is the same.
neighbourhood_group = AirBnB.pivot_table(index='neighbourhood_group', values='price', aggfunc='median')
neighbourhood_group.plot(kind='bar', color='blue')
plt.xlabel('Neighborhood group')
plt.ylabel('Median price')
Text(0, 0.5, 'Median price')
We can see that Manhattan has the highest median price by some distance.
Bar chart of mean price by neighborhood group
# Mean listing price per neighbourhood group. The aggregation is passed by
# name ('mean') because handing pandas the NumPy callable is deprecated and
# emits a FutureWarning on recent versions; the result is the same.
neighbourhood_group = AirBnB.pivot_table(index='neighbourhood_group', values='price', aggfunc='mean')
neighbourhood_group.plot(kind='bar', color='blue')
plt.xlabel('Neighborhood group')
plt.ylabel('Mean price')
Text(0, 0.5, 'Mean price')
Manhattan also has the highest mean price
Box plot price by neighborhood group
var = 'neighbourhood_group'
# Two-column frame (price plus the grouping column) feeding the box plot.
data = AirBnB[['price', var]]
f, ax = plt.subplots(figsize=(12, 8))
# `fig` is the Axes returned by seaborn, not a Figure.
fig = sns.boxplot(data=data, x=var, y="price", ax=ax)
# Clip the y-axis to 0-500 so the boxes stay readable; higher prices fall outside the view.
fig.set_ylim(bottom=0, top=500);
We can see again that Manhattan has the highest prices, illustrating that all figures in the five-number summary are higher than in any other neighborhood group.
Bar chart of number of properties per neighborhood group
# Number of listings (count of ids) in each neighbourhood group.
neighbourhood_group = AirBnB.pivot_table(
    values='id',
    index='neighbourhood_group',
    aggfunc='count',
)
neighbourhood_group.plot.bar(color='blue')
plt.xlabel('Neighborhood group')
plt.ylabel('Number of properties')
Text(0, 0.5, 'Number of properties')
We can see that Manhattan has the most properties, closely followed by Brooklyn. So Manhattan has the most properties, and they are the most expensive.
Break-down Manhattan with a box plot of price by neighborhood in Manhattan.
# Listings whose neighbourhood group is Manhattan.
Manhattan = AirBnB.loc[AirBnB['neighbourhood_group'].eq('Manhattan')]
var = 'neighbourhood'
# Two-column frame (price plus neighbourhood) feeding the box plot.
data = Manhattan[['price', var]]
f, ax = plt.subplots(figsize=(12, 8))
# `fig` is the Axes returned by seaborn, not a Figure.
fig = sns.boxplot(data=data, x=var, y="price", ax=ax)
# Clip the y-axis to 0-1000; higher prices fall outside the view.
fig.set_ylim(bottom=0, top=1000);
# Rotate the neighbourhood names so they do not overlap.
plt.xticks(rotation=90)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]),
[Text(0, 0, 'Midtown'),
Text(1, 0, 'Harlem'),
Text(2, 0, 'East Harlem'),
Text(3, 0, 'Murray Hill'),
Text(4, 0, "Hell's Kitchen"),
Text(5, 0, 'Upper West Side'),
Text(6, 0, 'Chinatown'),
Text(7, 0, 'West Village'),
Text(8, 0, 'Chelsea'),
Text(9, 0, 'Inwood'),
Text(10, 0, 'East Village'),
Text(11, 0, 'Lower East Side'),
Text(12, 0, 'Kips Bay'),
Text(13, 0, 'SoHo'),
Text(14, 0, 'Upper East Side'),
Text(15, 0, 'Washington Heights'),
Text(16, 0, 'Financial District'),
Text(17, 0, 'Morningside Heights'),
Text(18, 0, 'NoHo'),
Text(19, 0, 'Flatiron District'),
Text(20, 0, 'Roosevelt Island'),
Text(21, 0, 'Greenwich Village'),
Text(22, 0, 'Little Italy'),
Text(23, 0, 'Two Bridges'),
Text(24, 0, 'Nolita'),
Text(25, 0, 'Gramercy'),
Text(26, 0, 'Theater District'),
Text(27, 0, 'Tribeca'),
Text(28, 0, 'Battery Park City'),
Text(29, 0, 'Civic Center'),
Text(30, 0, 'Stuyvesant Town'),
Text(31, 0, 'Marble Hill')])
We can see that within Manhattan there is a lot of price variation between the neighborhoods.
Bar chart of number of properties per neighborhood in Manhattan
# Number of listings (count of ids) in each Manhattan neighbourhood.
neighbourhood_group = Manhattan.pivot_table(
    values='id',
    index='neighbourhood',
    aggfunc='count',
)
neighbourhood_group.plot.bar(color='blue')
plt.xlabel('Neighborhood')
plt.ylabel('Number of properties')
Text(0, 0.5, 'Number of properties')
We can also see a large variation between the number of properties listed in each neighborhood.
At first glance it appears that neighborhoods with the higher prices have fewer properties, but more detailed analysis would be needed in this area.
Plot the properties on the map, grouped by neighbourhood_group, to visualise the distribution.
plt.figure(figsize=(12,8))
plt.style.use('fast')
# Set the boundary of the map using longitude and latitude obtained from Google Maps
# imshow's extent order is (left, right, bottom, top), i.e. (min lon, max lon, min lat, max lat)
coordinates = (-74.2623, -73.6862, 40.4943, 40.9144)
# NOTE: `map` shadows the builtin, but the name is kept because the later map cells reuse it.
map = mpimg.imread("New_York_City.jpg")
plt.imshow(map,extent=coordinates)
# One scatter call per neighbourhood group so each gets its own colour and legend entry.
groups = AirBnB.groupby('neighbourhood_group')
for name, group in groups:
    # The loop body must be indented; without it the cell does not parse.
    plt.scatter(group['longitude'], group['latitude'], label=name, edgecolors='black')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Properties by neighborhood')
plt.legend()
<matplotlib.legend.Legend at 0x23c20f85f60>
Now plot the properties using a colour gradient for price
# All listings over the city map, coloured by price.
plt.figure(figsize=(12, 8))
plt.imshow(map, extent=coordinates)
points = plt.scatter(
    x=AirBnB['longitude'],
    y=AirBnB['latitude'],
    c=AirBnB['price'],
    cmap='rainbow',
    edgecolors='black',
)
plt.title('Properties by price')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
# Colour scale for the scatter points.
plt.colorbar(points)
<matplotlib.colorbar.Colorbar at 0x23c2d623e80>
The wide price range makes this view unhelpful. We know the range is 0-10,000 but 75% of the properties are below 175, so the vast majority of the properties will have tiny colour variations at the same end of the colour range.
We'll take a look at the higher value properties first; plot only those over 1000
# Listings priced above 1000 over the city map, coloured by price.
high_price = AirBnB.loc[AirBnB['price'].gt(1000)]
plt.figure(figsize=(12, 8))
plt.imshow(map, extent=coordinates)
points = plt.scatter(
    x=high_price['longitude'],
    y=high_price['latitude'],
    c=high_price['price'],
    cmap='rainbow',
    edgecolors='black',
)
plt.title('Properties over $1000 by price')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
# Colour scale for the scatter points.
plt.colorbar(points)
<matplotlib.colorbar.Colorbar at 0x23c306e6170>
We can start to see where the very expensive properties are located, which is Manhattan as expected from the earlier analysis of price by neighbourhood_group, but also Queens which was unexpected.
Now to see the price distribution of the lower cost properties. We know that 75% of properties are priced below 175 so plot only those under 175
# Listings priced below 175 over the city map, coloured by price.
low_price = AirBnB.loc[AirBnB['price'].lt(175)]
plt.figure(figsize=(12, 8))
plt.imshow(map, extent=coordinates)
points = plt.scatter(
    x=low_price['longitude'],
    y=low_price['latitude'],
    c=low_price['price'],
    cmap='rainbow',
    edgecolors='black',
)
plt.title('Properties under $175 by price')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
# Colour scale for the scatter points.
plt.colorbar(points)
<matplotlib.colorbar.Colorbar at 0x23c2ec5f970>
Once again the higher priced properties at the lower end of the price range are clustered in Manhattan.
Now to see the price distribution of the middle 50% of properties by cost, plot only those >= 69 and <= 175
# Listings priced from 69 to 175 inclusive over the city map, coloured by price.
# between() includes both endpoints by default, matching >= 69 and <= 175.
mid_price = AirBnB[AirBnB['price'].between(69, 175)]
plt.figure(figsize=(12, 8))
plt.imshow(map, extent=coordinates)
points = plt.scatter(
    x=mid_price['longitude'],
    y=mid_price['latitude'],
    c=mid_price['price'],
    cmap='rainbow',
    edgecolors='black',
)
plt.title('Properties >= $69 and <= $175 by price')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
# Colour scale for the scatter points.
plt.colorbar(points)
<matplotlib.colorbar.Colorbar at 0x23c2c588130>
In the mid-price range the higher prices are again clustered in Manhattan, but they are distributed across the other regions too.